In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.datasets import load_boston
boston = load_boston()
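Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. If the import above fails, one possible workaround is to fetch the same dataset from OpenML; a sketch (untested here, and the OpenML copy returns CHAS and RAD as categorical strings that may need casting):
In [ ]:
# Hedged alternative for newer scikit-learn: fetch the Boston data from OpenML.
from sklearn.datasets import fetch_openml
boston = fetch_openml(name="boston", version=1, as_frame=False)
# boston.data, boston.target, and boston.feature_names then mirror the old
# Bunch, apart from the dtype caveat above.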
In [11]:
X = boston.data
y = boston.target
names = boston.feature_names
In [3]:
len(X), type(X)
Out[3]:
In [12]:
dfX0 = pd.DataFrame(X, columns=names)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(y, columns=["MEDV"])
df = pd.concat([dfX, dfy], axis=1)
df.tail(2)
Out[12]:
In [ ]:
# sns.pairplot(df)  # takes too long!
In [10]:
# sns.pairplot(df, diag_kind="kde", kind="reg")
# plt.show()
In [6]:
sns.jointplot("RM", "MEDV", data=df)
plt.show()
In [7]:
import statsmodels.api as sm
In [13]:
model = sm.OLS(df.iloc[:, -1], df.iloc[:, :-1])
result = model.fit()
print(result.summary())
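The summary is dense; individual statistics are also exposed as attributes on the results object, which is handy for comparing models programmatically. A minimal sketch:
In [ ]:
# Pull out the headline numbers instead of reading the full summary.
result.rsquared          # coefficient of determination
result.condition_number  # what the summary's multicollinearity note is based on
result.params            # fitted coefficients, indexed by column name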
In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
X_scaled
Out[14]:
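With with_mean=False the scaler skips centering and only divides each column by its standard deviation, so zero entries (e.g. CHAS) stay zero. A quick sanity check, assuming the arrays above:
In [ ]:
# StandardScaler(with_mean=False) should match plain division by the
# per-column standard deviation (ddof=0, as scikit-learn uses).
np.allclose(X_scaled, X / X.std(axis=0))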
In [15]:
dfX0 = pd.DataFrame(X_scaled, columns=names)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(y, columns=["MEDV"])
df = pd.concat([dfX, dfy], axis=1)
df.tail(2)
Out[15]:
In [8]:
model = sm.OLS(df.iloc[:, -1], df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [9]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df.iloc[:, :-1], df.iloc[:, -1])
model.intercept_, model.coef_
Out[9]:
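Caveat: df already carries the 'const' column from add_constant, and LinearRegression adds its own intercept on top, so the two intercept terms are collinear and neither intercept_ nor the const coefficient is individually meaningful. A sketch of one fix:
In [ ]:
# Let the explicit const column act as the intercept instead.
model_nc = LinearRegression(fit_intercept=False)
model_nc.fit(df.iloc[:, :-1], df.iloc[:, -1])
model_nc.coef_  # the first entry (for const) is now the intercept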
In [17]:
sns.histplot(result.resid, kde=True)
plt.show()
In [18]:
sns.histplot(df.MEDV, kde=True)
Out[18]:
In [20]:
df.count()
Out[20]:
In [26]:
# drop the rows where MEDV sits at its maximum
df2 = df.drop(df[df.MEDV >= df.MEDV.max()].index)
df2.count()
Out[26]:
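The rows dropped here are exactly those at the maximum MEDV; in the original data the target is top-coded (censored) at 50.0, so these are artifacts of data collection rather than ordinary outliers. A quick check, assuming df from above:
In [ ]:
# MEDV is top-coded at 50.0; count how many rows sit at the cap.
df.MEDV.max(), (df.MEDV == df.MEDV.max()).sum()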
In [28]:
model2 = sm.OLS(df2.iloc[:, -1], df2.iloc[:, :-1])
result2 = model2.fit()
print(result2.summary())
# After scaling and dropping the outliers, R-squared on the Boston data rises to 0.778 and the Jarque-Bera statistic shrinks as well.
In [30]:
model_anova = sm.OLS.from_formula("MEDV ~ CHAS", data=df2)
result_anova = model_anova.fit()
table_anova = sm.stats.anova_lm(result_anova)
table_anova
Out[30]:
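Since CHAS is the only regressor here and it is binary (rescaled above, so its levels are 0 and a positive constant), this one-way ANOVA is equivalent to a two-sample t-test on MEDV; the F statistic should equal the squared t statistic. A sketch using scipy:
In [ ]:
from scipy import stats

# Equivalent two-sample t-test: F from the ANOVA table == t**2.
on_river = df2.MEDV[df2.CHAS > 0]
off_river = df2.MEDV[df2.CHAS == 0]
stats.ttest_ind(on_river, off_river)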
In [41]:
model1 = LinearRegression()
model1.fit(df.iloc[:, :-1], df.iloc[:, -1])
model1.intercept_, model1.coef_
Out[41]:
In [32]:
model2 = LinearRegression()
model2.fit(df2.iloc[:, :-1], df2.iloc[:, -1])
model2.intercept_, model2.coef_
Out[32]:
In [33]:
from sklearn.model_selection import cross_val_score
In [42]:
score = cross_val_score(model1, df.iloc[:, :-1], df.iloc[:, -1], cv=5)
score, score.mean(), score.std()
Out[42]:
In [39]:
score2 = cross_val_score(model2, df2.iloc[:, :-1], df2.iloc[:, -1], cv=5)
score2, score2.mean(), score2.std()
Out[39]:
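One caveat with these scores: the Boston rows are ordered by town, so contiguous unshuffled folds can score much worse than the in-sample fit. A hedged variant worth trying is shuffled K-fold:
In [ ]:
from sklearn.model_selection import KFold

# Shuffle before splitting so each fold sees a mix of towns.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
cross_val_score(model2, df2.iloc[:, :-1], df2.iloc[:, -1], cv=cv)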
In [45]:
# Log transform: replace the right-skewed columns with their logs
df3 = df2.drop(["CRIM", "DIS", "LSTAT", "MEDV"], axis=1)
df3["LOGCRIM"] = np.log(df2.CRIM)
df3["LOGDIS"] = np.log(df2.DIS)
df3["LOGLSTAT"] = np.log(df2.LSTAT)
df3["MEDV"] = df2.MEDV
In [46]:
sns.jointplot("CRIM", "MEDV", data=df2)
Out[46]:
In [47]:
sns.jointplot("LOGCRIM", "MEDV", data=df3)
Out[47]:
In [48]:
sns.jointplot("DIS", "MEDV", data=df2)
Out[48]:
In [49]:
sns.jointplot("LOGDIS", "MEDV", data=df3)
Out[49]:
In [50]:
sns.jointplot("LSTAT", "MEDV", data=df2)
Out[50]:
In [51]:
sns.jointplot("LOGLSTAT", "MEDV", data=df3)
Out[51]:
In [52]:
model3 = sm.OLS(df3.iloc[:, -1], df3.iloc[:, :-1])
result3 = model3.fit()
print(result3.summary())
In [53]:
score3 = cross_val_score(LinearRegression(), df3.iloc[:, :-1], df3.iloc[:, -1], cv=5)
score3, score3.mean(), score3.std()
Out[53]:
In [54]:
# Multicollinearity
sns.heatmap(np.corrcoef(df3.T))
Out[54]:
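Note df3 still contains the const column, whose zero variance makes np.corrcoef return NaN for that row and column; pandas computes the same labeled correlation matrix directly, which makes the heatmap easier to read:
In [ ]:
# Same column-wise correlations with axis labels attached; the const
# row/column stays NaN because its variance is zero.
sns.heatmap(df3.corr())
plt.show()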
In [56]:
df4 = df3.drop(["ZN", "INDUS", "AGE", "LOGCRIM", "RAD", "TAX"], axis=1)
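The columns dropped above were picked by eye from the heatmap; variance inflation factors give a numeric criterion (VIF well above ~10 is the usual red flag). A minimal sketch with statsmodels, run on df3 to see which columns warranted dropping:
In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each regressor in df3 (the const column's VIF is not meaningful).
X3 = df3.iloc[:, :-1].values
for i, name in enumerate(df3.columns[:-1]):
    print(name, variance_inflation_factor(X3, i))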
In [57]:
model4 = sm.OLS(df4.iloc[:, -1], df4.iloc[:, :-1])
result4 = model4.fit()
print(result4.summary())
In [58]:
model4 = LinearRegression()
model4.fit(df4.iloc[:, :-1], df4.iloc[:, -1])
model4.intercept_, model4.coef_
Out[58]:
In [59]:
score4 = cross_val_score(LinearRegression(), df4.iloc[:, :-1], df4.iloc[:, -1], cv=5)
score4, score4.mean(), score4.std()
Out[59]:
In [60]:
sns.heatmap(np.corrcoef(df4.T), xticklabels=df4.columns, yticklabels=df4.columns, annot=True)
Out[60]: